C3 



ssaa. 
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Add new user-song associations in batches, allowing a significant period of time between each 
batch. 

Since the total in the denominator of all the p calculations does not change between
batches, it is possible, at the end of a batch load, to create a one-dimensional array to
represent the p log p values, where the index is the numerator in the p calculation. Thus, each
relevant p log p calculation needs to be performed only once, and is then reused.

Instead of actually re-allocating memory for the array at the end of each batch load, the array can
be zeroed out. A 0 in an element indicates that p log p has not yet been calculated. So, when a
value is needed for p log p, the appropriate element is checked; if it is 0, the value is calculated
and stored. If it is non-zero, the value that is already there is used.

M Appendix C 

#VERSION 12 08/27/00

# Copyright (c) 2000 by Virtual Development Corp. All Rights Reserved.
# Usage Notes ##########################

# MinimumConvergenceIterations in the Config file must be at least 1. (See BUGS.)

# MinimumConvergenceIterations "beats" MaxTime. It will run for the minimum
# number of iterations, then run until MaxTime.



# work_ = Work instance 

# rel_ = Relatable instance 

# clus_ = Cluster 
# clst_ = ClusterSet

# clss_ = ClusterSetSignature 

import whrandom 
import math 
35 import xmllib 
import copy 
import time 
import ConfigParser 
import urllib 
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import sys 



# Utility stuff 

# Module-wide random generator shared by shuffle().  For why it is global,
# see http://starship.python.net/crew/donp/script/sample.py
G_generator = whrandom.whrandom()
# Seeding is deliberately left disabled; the original kept this call
# commented out: G_generator.seed(1, 1, 3)

def shuffle(sample_size):
    """Return a random permutation of the integers from 1 to sample_size.

    Moses and Oakford algorithm.  See Knuth, vol 2, section 3.4.2, and
    http://starship.python.net/crew/donp/script/sample.py

    sample_size must be a positive plain integer.  Random numbers are drawn
    from the module-level G_generator.
    """
    assert type(sample_size) == type(0) and sample_size > 0

    # Start from the identity permutation 1..sample_size.  (The local is not
    # called "list" so the builtin stays usable.)
    lstInt_permutation = list(range(1, sample_size + 1))

    # Walk from the last slot down to slot 1, swapping each slot with a
    # uniformly chosen slot at or below it.
    for ix in range(sample_size - 1, 0, -1):
        rand_int = G_generator.randint(0, ix)
        if rand_int == ix:
            continue  # element stays where it is
        lstInt_permutation[ix], lstInt_permutation[rand_int] = (
            lstInt_permutation[rand_int], lstInt_permutation[ix])
    return lstInt_permutation

# From http://starship.python.net/pipermail/python-de/1997q1/000026.html
# "Converter module from strings to HTML entities"
# The code is slightly modified to use the entities that the Python XML
# parser decodes by default, rather than using htmlentitydefs.

# Maps the ordinal of each character that must be escaped to its entity name.
EntitiesByOrd = {
    ord('<'): 'lt',
    ord('>'): 'gt',
    ord('&'): 'amp',
    ord('"'): 'quot',
    ord("'"): 'apos',
}


def toXML(s):
    """Return s with the five XML special characters replaced by entities.

    Every character whose ordinal is a key of EntitiesByOrd is replaced by
    '&<name>;'; all other characters are copied unchanged.  An empty string
    yields an empty string.
    """
    lstStr_piece = []
    for str_char in s:
        str_entity = EntitiesByOrd.get(ord(str_char))
        if str_entity is None:
            lstStr_piece.append(str_char)
        else:
            lstStr_piece.append('&' + str_entity + ';')
    # A single join avoids rebuilding the result string once per entity.
    return ''.join(lstStr_piece)

def computeEvenRankUnitRanks(lstTup_input, float_tuningRankBottom=None):
    """Rank tuples by their first element and scale the ranks to (0, 1).

    SHOULD BE IN DATA object.

    Suppose 100 values are tied for second place, and 1 is alone in first.
    It should not be assumed that we should put the lone value in the top
    percentile, because it could easily be due to noise.  So, we compromise
    by saying there are 2 ranks, and we assign .25 to everyone in the low
    and .75 to the one in the high.

    FURTHER ADJUSTMENT DURING TIME OF LITTLE DATA: if there are two input
    sort field values, 1 and 2, the plain algorithm gives outputs .25 and
    .75.  But that still means that the low level is much closer to 0 than
    the high level is.  So the result is rescaled into the interval
    [float_tuningRankBottom, 1]; with a bottom of .5 the two levels become
    .625 and .875.

    Parameters:
        lstTup_input -- list of tuples; only element 0 is used for ranking.
            The list is sorted in place.
        float_tuningRankBottom -- lower end of the output interval.  When
            omitted (None), Config.float_tuningRankBottom is used.

    Returns a new list holding the input tuples in rank order, each with two
    extra trailing elements: the integer rank (ties share a rank; best is
    highest) and the tuned unit rank.  An empty input yields an empty list.
    """
    if float_tuningRankBottom is None:
        float_tuningRankBottom = Config.float_tuningRankBottom

    if not lstTup_input:
        return []

    lstTup_input.sort()

    # First pass: assign integer ranks, giving ties the same rank.
    lstTup_intermediate = []
    int_rank = 0
    for int_index, tup_ in enumerate(lstTup_input):
        if int_index == 0 or tup_[0] != lstTup_input[int_index - 1][0]:
            int_rank = int_rank + 1
        lstTup_intermediate.append(tup_ + (int_rank,))

    # Second pass: int_rank now holds the number of distinct ranks.  Each
    # rank maps to the midpoint of its slice of the unit interval, which is
    # then squeezed into [float_tuningRankBottom, 1].
    lstTup_output = []
    for tup_ in lstTup_intermediate:
        float_ = (tup_[-1] - 0.5) / float(int_rank)
        float_tuning = (float_tuningRankBottom
                        + float_ * (1.0 - float_tuningRankBottom))
        lstTup_output.append(tup_ + (float_tuning,))

    return lstTup_output

def computeAverageUnitRanks(lstTup_input):
    """Rank tuples by their first element, averaging the ranks of ties.

    NOT USED IN CURRENT CODE 8/24/00.

    The first element in each tuple is the only one used in the ranking.
    The input list is sorted in place.  The output list contains tuples
    identical to the input tuples but with an added element at the end: the
    unit rank, with duplicates assigned the average rank of the duplicate
    set.  An empty input yields an empty list.
    """
    int_limitIndex = len(lstTup_input)
    if not int_limitIndex:
        return []  # avoids dividing by zero when computing the offset

    def isLastInDupSet(int_index, lstTup_):
        # True when the element is the last of the list, or the next
        # element has a different sort value.
        if len(lstTup_) == int_index + 1:
            return 1
        return lstTup_[int_index][0] != lstTup_[int_index + 1][0]

    # Half a slot, so that unit ranks sit at the centre of their slot.
    float_offset = 1.0 / (2.0 * int_limitIndex)

    lstTup_input.sort()

    lstTup_output = []
    int_startDupIndex = 0
    lst_currentDupSet = []
    for int_index in range(int_limitIndex):
        lst_currentDupSet.append(lstTup_input[int_index])
        if not isLastInDupSet(int_index, lstTup_input):
            continue

        # Compute the average unit rank of the finished duplicate set.
        float_averageRank = (int_index + int_startDupIndex) / 2.0
        float_averageUnitRank = (float_offset
                                 + float_averageRank / int_limitIndex)

        # Add to output list
        for tup_ in lst_currentDupSet:
            lstTup_output.append(tup_ + (float_averageUnitRank,))

        # Set the stage for the next duplicate set
        int_startDupIndex = int_index + 1
        lst_currentDupSet = []
    return lstTup_output



# Classes 
class Config:
    """Run-time configuration, read once from an INI-style file.

    When an instance is created, the class attributes are populated; at
    that point, the instance itself can be thrown away.  All other code
    reads the values as Config.<attribute>.
    """

    # Values populated by __init__ (None until Config() has been called).
    str_clusterFile = None
    str_useFile = None
    str_oldUseFile = None
    int_createClusterCount = None
    float_maxTime = None
    int_minimumConvergenceIterations = None
    str_outClusterFile = None
    float_tuningRankBottom = None
    float_tuningZeroWeight = None

    # Name of the configuration file and of its single section.
    C_str_configFile = 'clusterconfig.txt'
    C_str_sectionName = 'Configuration'

    # Option names inside the section.
    C_str_clusterFile = 'InClusterFile'
    C_str_useFile = 'UseFile'
    C_str_oldUseFile = 'OldUseFile'
    C_str_createClusterCount = 'CreateClusterCount'
    C_str_maxTime = 'MaxTime'
    C_str_minimumConvergenceIterations = 'MinimumConvergenceIterations'
    C_str_outClusterFile = 'OutClusterFile'
    C_str_tuningRankBottom = 'TuningRankBottom'
    C_str_tuningZeroWeight = 'TuningZeroWeight'

    def __init__(self):
        """Read C_str_configFile and store every option on the class.

        Numeric options are converted with int() / float(); a missing
        section or option, or a malformed number, raises from ConfigParser
        or from the conversion.
        """
        configParser = ConfigParser.ConfigParser()
        configParser.read(Config.C_str_configFile)
        str_section = Config.C_str_sectionName

        Config.str_clusterFile = configParser.get(
            str_section, Config.C_str_clusterFile)
        Config.str_useFile = configParser.get(
            str_section, Config.C_str_useFile)
        Config.str_oldUseFile = configParser.get(
            str_section, Config.C_str_oldUseFile)
        Config.int_createClusterCount = int(configParser.get(
            str_section, Config.C_str_createClusterCount))
        Config.float_maxTime = float(configParser.get(
            str_section, Config.C_str_maxTime))
        Config.int_minimumConvergenceIterations = int(configParser.get(
            str_section, Config.C_str_minimumConvergenceIterations))
        Config.float_tuningRankBottom = float(configParser.get(
            str_section, Config.C_str_tuningRankBottom))
        Config.float_tuningZeroWeight = float(configParser.get(
            str_section, Config.C_str_tuningZeroWeight))
        Config.str_outClusterFile = configParser.get(
            str_section, Config.C_str_outClusterFile)

class Data:
    """All play statistics and the structures derived from them.

    This is a singleton.  One instance is created, and that creates
    everything.

    "Longnames" are of the format "Beatles - Hey Jude": the artist and the
    title separated by space-dash-space.  Each Work object is uniquely
    identified by a Longname.
    """

    singleton = None

    def __init__(self):
        """Register the singleton, then load and derive all data.

        Reads the use file named by Config.str_useFile, so Config() must
        have been called first.
        """
        assert not self.__class__.singleton
        self.__class__.singleton = self

        # user -> {longname: None}: the set of works each user played.
        self.dictStrDictStrNone_userLongname = {}
        # longname2 -> {longname1: unit rank}: profile of each work.
        self.dictStrDictStrFloat_longname2Longname1UnitRank = {}
        # longname -> Work
        self.dictLongnameWork_ = {}
        # longname1 -> {longname2: number of users who played both}
        self.dictStrDictStrInt_longname1Longname2Count = {}
        # longname -> number of qualifying user entries for the work
        self.dictStrInt_longnameUniqueCount = {}
        self.lstWork_ = []

        assert Config.str_useFile
        print('about to read data')
        self.readUserPlayStats(Config.str_useFile)
        print('about to generate use counts')
        self.generateUseCounts()
        print('about to generate unit ranks')
        self.generateUnitRanks()

    def displayCheckingInfo(self):
        # Debugging aid: builds the sorted profile of one hard-coded work.
        # Nothing is printed or returned in the code as received.
        dict_russians = self.dictStrDictStrFloat_longname2Longname1UnitRank[
            'Sting - Russians']
        lst_russians = list(dict_russians.items())
        lst_russians.sort()

    def getWorks(self):
        """Return the list of all Work objects created while loading."""
        return self.lstWork_

    def getUnitRanks(self):
        """Return the longname2 -> {longname1: unit rank} mapping."""
        assert self.dictStrDictStrFloat_longname2Longname1UnitRank
        return self.dictStrDictStrFloat_longname2Longname1UnitRank

    def getAssociatedLongnames(self, str_longname):
        """Return the longnames in the profile of the given work."""
        dict_ = self.dictStrDictStrFloat_longname2Longname1UnitRank
        assert str_longname in dict_
        return list(dict_[str_longname].keys())

    def readUserPlayStats(self, str_fileName):
        """Parse the use-list XML (local file or http:// URL) in two passes.

        Pass 1 counts, per work, the qualifying user entries.  Pass 2 keeps
        only works counted more than once, records them per user and
        creates the Work objects.
        """
        if str_fileName[:7] == "http://":
            fil_ = urllib.urlopen(str_fileName)
        else:
            fil_ = open(str_fileName, 'r')
        try:
            str_ = fil_.read()
        finally:
            fil_.close()  # release the handle even if the read fails

        class UseListContainerParser1(xmllib.XMLParser):
            # Embedded class, only used here!
            # THIS LOGIC ASSUMES UNIQUENESS AT USER/SONG LEVEL IN THE
            # INPUT XML FILE!!

            def __init__(self, data_):
                self.str_currentUser = None
                self.data_ = data_
                xmllib.XMLParser.__init__(self)

            def start_entry(self, dict_):
                # str_work is the title of the work, which must be
                # distinguished from Work objects!
                # Entries of three hard-coded accounts are ignored, as are
                # entries whose play count is not above 1.
                if (self.str_currentUser != 'mike3k@mail.com'
                        and self.str_currentUser != 'jake@jspace.org'
                        and self.str_currentUser != 'jake@braincase.net'):
                    if int(dict_['count']) > 1:
                        str_artist = intern(dict_['artist'])
                        str_work = intern(dict_['work'])
                        str_longname = intern(
                            '%s - %s' % (str_artist, str_work))
                        dict_count = self.data_.dictStrInt_longnameUniqueCount
                        dict_count[str_longname] = (
                            dict_count.get(str_longname, 0) + 1)

            def start_useList(self, dict_):
                self.str_currentUser = dict_['user']

        class UseListContainerParser2(xmllib.XMLParser):
            # Embedded class, only used here!

            def __init__(self, data_):
                self.str_currentUser = None
                self.data_ = data_
                xmllib.XMLParser.__init__(self)

            def start_entry(self, dict_):
                # str_work is the title of the work, which must be
                # distinguished from Work objects!
                str_artist = intern(dict_['artist'])
                str_work = intern(dict_['work'])
                str_longname = intern('%s - %s' % (str_artist, str_work))

                data_ = self.data_
                if data_.dictStrInt_longnameUniqueCount.get(
                        str_longname, 0) > 1:
                    # Record the work for the current user; nothing to do
                    # if it is already there.
                    dict_userWorks = (
                        data_.dictStrDictStrNone_userLongname.setdefault(
                            self.str_currentUser, {}))
                    dict_userWorks[str_longname] = None

                    # NOTE(review): the nesting of this step was lost in
                    # the listing; creating Work objects only for retained
                    # works is assumed -- confirm against the original.
                    if str_longname not in data_.dictLongnameWork_:
                        work_ = Work(str_longname, str_artist, str_work)
                        data_.lstWork_.append(work_)
                        data_.dictLongnameWork_[str_longname] = work_

            def start_useList(self, dict_):
                self.str_currentUser = dict_['user']

        parser_1 = UseListContainerParser1(self)
        parser_1.feed(str_)
        parser_1.close()

        parser_2 = UseListContainerParser2(self)
        parser_2.feed(str_)
        parser_2.close()

    def generateUseCounts(self):
        """Count, for every ordered pair of distinct works, the users who
        played both, and store the result on the instance."""
        dictStrDictStrInt_longname1Longname2Count = {}

        sys.stdout.flush()  # push out progress messages before the long loop

        for str_user in self.dictStrDictStrNone_userLongname.keys():
            lstStr_longname = list(
                self.dictStrDictStrNone_userLongname[str_user].keys())
            for str_longname1 in lstStr_longname:
                for str_longname2 in lstStr_longname:
                    # Original author's note: songs played by only 1 user
                    # can still be clustered due to the user's other
                    # choices; not counting cases where the two are equal
                    # would eliminate them, and should cause logic that
                    # loops through all of the songs looking for unitRanks
                    # to fail.  The code nevertheless skips equal pairs.
                    if str_longname1 != str_longname2:
                        dict_inner = (
                            dictStrDictStrInt_longname1Longname2Count
                            .setdefault(str_longname1, {}))
                        dict_inner[str_longname2] = (
                            dict_inner.get(str_longname2, 0) + 1)

        self.dictStrDictStrInt_longname1Longname2Count = (
            dictStrDictStrInt_longname1Longname2Count)

    def generateUnitRanks(self):
        """Turn the pair counts into per-work profiles of unit ranks.

        "Unit ranks" are ranks scaled down to the unit interval.  For
        instance, the lowest rank out of 57 elements is 0, and the highest
        is 56/57 = .98245614035.  But, we also perform averaging, so ranks
        that extreme should be unusual.

        Consider longname1 to be a work "associated" with longname2.
        Longname2 is the work for which we are generating a profile; this
        profile involves the associated longname1 works.  That is, a
        profile for a longname2 would contain all the longname1's that are
        associated with it.  For each associated work, considered across
        all main works, there is one rank for each main work; that's where
        the uniform distribution comes from.  The alternative would be: for
        each main work have one rank for each associated work; then some
        associated works would NECESSARILY have very low rank.  In
        contrast, using the approach presented, all associated works CAN
        have high rank but under the null hypothesis the distribution would
        be uniform.
        """
        self.dictStrDictStrFloat_longname2Longname1UnitRank = {}
        dict_profile = self.dictStrDictStrFloat_longname2Longname1UnitRank

        for str_longname1 in self.dictStrDictStrInt_longname1Longname2Count.keys():
            dictStrInt_longname2Count = (
                self.dictStrDictStrInt_longname1Longname2Count[str_longname1])

            lstTupIntStr_ = []
            for str_longname2 in dictStrInt_longname2Count.keys():
                lstTupIntStr_.append(
                    (dictStrInt_longname2Count[str_longname2], str_longname2))

            # NOTE(review): the listing had a debugging "if" on the title
            # 'Elton John - Levon' here whose body was not recoverable; it
            # is omitted.
            lstTupIntStr_.sort()
            lstTupIntStrIntFloat_ = computeEvenRankUnitRanks(lstTupIntStr_)

            for tupIntStrIntFloat_ in lstTupIntStrIntFloat_:
                float_ = tupIntStrIntFloat_[-1]
                str_longname2 = tupIntStrIntFloat_[1]
                dict_profile.setdefault(str_longname2, {})[str_longname1] = float_

class Relatable:
    """Base class for anything that can be related to a cluster profile.

    Subclasses supply a name, the list of longnames in their profile, and
    the relatedness value for each longname; getOverallRelatedness()
    combines two such profiles into one score.
    """

    def getName(self):
        # Abstract.  An exception is raised instead of "assert 0" so the
        # check survives running Python with -O.
        raise NotImplementedError('getName must be overridden')

    def getAssociatedRelatedness(self, str_otherName):
        # Abstract: relatedness of this object to the named work.
        raise NotImplementedError('getAssociatedRelatedness must be overridden')

    def getAssociatedLongnames(self):
        # Abstract: longnames that make up this object's profile.
        raise NotImplementedError('getAssociatedLongnames must be overridden')

    def getOverallRelatedness(self, rel_):
        """Return the weighted average relatedness between self and rel_.

        For every longname in self's profile, self's value is multiplied by
        rel_'s value for the same longname.  Terms where rel_ has no
        relatedness (None or 0) contribute nothing to the sum but count in
        the divisor with weight Config.float_tuningZeroWeight; all other
        terms have weight 1.  Returns 0.0 when the total weight is zero.
        """
        float_zeroWeight = Config.float_tuningZeroWeight
        float_sum = 0.0
        float_divisor = 0.0

        for str_name in self.getAssociatedLongnames():
            float_other = rel_.getAssociatedRelatedness(str_name)
            if float_other is None:  # Defensive programming
                float_other = 0.0

            if float_other == 0:
                float_weight = float_zeroWeight
            else:
                float_weight = 1.0
            float_divisor = float_divisor + float_weight

            # Cast is defensive programming.
            # NOTE(review): a profile value of None (as stored by
            # Cluster.addToProfile) makes this cast raise TypeError.
            float_self = float(self.getAssociatedRelatedness(str_name))

            float_sum = float_sum + float_self * float_other * float_weight

        if float_divisor:
            return float_sum / float_divisor
        return 0.0

class Work(Relatable):
    """One musical work (artist plus title), identified by its longname.

    The xml attribute 'work' is the title of the work, which must be
    distinguished from Work objects, which contain artist info as well as
    title info!
    """

    def __init__(self, str_longname, str_artist, str_work):
        # The "Longname" of the work, for purposes of this program, is the
        # artist + the work title.
        #
        # The listing also called
        # Data.singleton.getAssociatedLongnames(str_longname) here and
        # discarded the result.  Works are constructed while the data is
        # still being read, before any unit ranks exist, so that call could
        # only trip its assertion; it has been removed.
        self.str_longname = str_longname
        self.str_artist = str_artist
        self.str_work = str_work

    def getName(self):
        """Return the longname ("artist - title")."""
        return self.str_longname

    def getArtist(self):
        """Return the artist name."""
        return self.str_artist

    def getAssociatedRelatedness(self, str_longname):
        """Return this work's unit rank for str_longname, or 0.0 if the
        other work is not in this work's profile."""
        # Using an intermediate name just for clarity.
        dict_ = Data.singleton.getUnitRanks()
        assert dict_.has_key(self.str_longname)
        return dict_[self.str_longname].get(str_longname, 0.0)

    def getAssociatedLongnames(self):
        """Return the longnames in this work's profile."""
        # Fixed: the lookup must use this work's own longname; the listing
        # referenced an undefined local name.
        return Data.singleton.getAssociatedLongnames(self.str_longname)

class Cluster(Relatable):
    # To understand this class, it's important to understand the difference
    # between a cluster's membership list and its profile.  Both of them
    # involve a group of objects subclassed from Relatable.  But the
    # membership list (self.lstRel_member) determines the objects that are
    # currently members of a cluster; whereas, the profile
    # (self.dictStrFloat_longnameRelatedness) is a description of the
    # current "center" of the cluster for purposes of measuring the distance
    # between the cluster and an object that is a candidate for membership
    # in the cluster.
    #
    # Normally, all candidate objects are assigned to a cluster before the
    # profile is computed; these assignments are based on the old profiles.
    # For instance, when clusters are being generated for the first time,
    # the old profiles are random.  When clusters are being regenerated
    # based on old clusters read from an xml disk file, the profiles from
    # the disk file clusters are used as the old profiles.

    # Next name handed out to a cluster created without an explicit name.
    str_nextAutomaticName = '1'

    def __init__(self, str_name=None):
        """Create an empty cluster; unnamed clusters get '1', '2', ..."""
        self.lstRel_member = []
        self.dictStrFloat_longnameRelatedness = {}
        if str_name:
            self.str_name = str_name
        else:
            int_ = int(self.__class__.str_nextAutomaticName)
            self.str_name = self.__class__.str_nextAutomaticName
            self.__class__.str_nextAutomaticName = str(int_ + 1)

    def getName(self):
        return self.str_name

    def getMembers(self):
        return self.lstRel_member

    def getAssociatedRelatedness(self, str_longname):
        """Return the profile value for str_longname, or 0.0 if absent."""
        # 1 or 0 for binary profiles
        return self.dictStrFloat_longnameRelatedness.get(str_longname, 0.0)

    def getCountUniqueArtist(self):
        """Return the number of distinct artists among the member works."""
        if not self.lstRel_member:
            return 0
        assert self.lstRel_member[0].__class__ == Work
        dict_ = {}
        for work_ in self.lstRel_member:
            dict_[work_.getArtist()] = None
        return len(dict_)

    def getAssociatedLongnames(self):
        """Return the longnames that make up the profile."""
        return list(self.dictStrFloat_longnameRelatedness.keys())

    def addToCluster(self, rel_):
        """Add a member; the profile is not touched."""
        self.lstRel_member.append(rel_)

    def addToProfile(self, strLongname):
        # Used for initializing empty profile for later clustering.
        self.dictStrFloat_longnameRelatedness[strLongname] = None

    def computeClusterProfile(self, bool_binary):
        """Recompute the profile from the current members.

        Normally, relatedness of each member to the cluster is binary --
        1 if it's in the dict, 0 otherwise.  However, in the final cluster
        convergence, it makes sense to do a 2-stage profile computation;
        first we compute the binary values (represented by membership in
        the dict vs. non-membership), then, using those values, we
        recompute the profile, generating floating point values.  This
        allows us, in the final convergence, to generate clusters in such a
        way that the most remote profile elements don't hold as great a
        sway over what potential members are attracted to the cluster.

        Entries already in the profile are never removed here.
        """
        # Original author's note (while reviewing for CLUSTERS13): this
        # apparently should be structured as IF BOOL_BINARY ... ELSE, which
        # would avoid setting dictStrFloat_longnameRelatedness twice.  Not
        # changed because the new version was in progress and the change
        # would not have been tested.  The two-pass behaviour is kept here.
        dict_profile = self.dictStrFloat_longnameRelatedness

        # Stage 1: binary profile.
        for rel_ in self.lstRel_member:
            if rel_.__class__ == Work:
                dict_profile[rel_.getName()] = 1.0
            elif rel_.__class__ == Cluster:
                for str_otherName in rel_.getAssociatedLongnames():
                    dict_profile[str_otherName] = 1.0
            else:
                assert 0  # Attempt to cluster an illegal class

        # Stage 2: replace the binary values by overall relatedness.  Each
        # value is computed against the profile as it stands at that
        # moment, so the evaluation order matters and is left untouched.
        if not bool_binary:
            for rel_ in self.lstRel_member:
                if rel_.__class__ == Work:
                    dict_profile[rel_.getName()] = (
                        self.getOverallRelatedness(rel_))
                elif rel_.__class__ == Cluster:
                    for str_otherName in rel_.getAssociatedLongnames():
                        dict_profile[str_otherName] = (
                            self.getOverallRelatedness(rel_))
                else:
                    assert 0  # Attempt to cluster an illegal class

    def makeEmpty(self):
        # Notice that it leaves the profile
        # (self.dictStrFloat_longnameRelatedness) intact for purposes of
        # getAssociatedRelatedness() and getAssociatedLongnames().
        self.lstRel_member = []

    def merge(self):
        # Turns a cluster of clusters (each of which must contain works)
        # into a cluster of works.
        lstWork_ = []
        for clus_ in self.lstRel_member:
            assert clus_.__class__ == Cluster
            for work_ in clus_.getMembers():
                assert work_.__class__ == Work
                lstWork_.append(work_)
        self.lstRel_member = lstWork_

class ClusterSet: 

def init ( self, str_f ileName=None, lstClus_persistent=None, 

int_randomClusterCount=None ) : 

# The constructor just loads or creates the clusters, it doesn't 

# do any processing. 

# When constructing from a file, the clusters 

# have profiles for measuring relatedness, but have no members. 

# When constructing from a list of clusters, they keep their members. 

# Randomly generated clusters are given members, 
self .lstClus_ = [] 
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if str_f ileName : 
readUserPlayStats ( str_f ileName ) 

elif int_randoinClusterCount : 

lstWork_ = Data. singleton. getWor ks {) 
int_countWorks = len( lstWork_ ) 
lstlnt_shuf f led = shuffle ( int_countWorks ) 

if int_countWorks < int_randomClusterCount : # Obviously only applicable in small 



int_randomClusterCount = int_countWorks 
int_numberOfRandomWorksPerCluster = int_countWorks / int_randomClusterCount 
clus_current = None 

for int_ in xrange ( int_countWorks ) : 

if int_ % int_numberOfRandomWorksPerCluster == 0: 
if clus_current : #Skip first iteration 

clus_current .computeClusterProf ile ( bool_binary=l ) 
clus_current = Cluster () 
self .addToClusterSet ( clus_current ) 
clus_current.addToCluster ( lstWork_[ lstlnt_shuf f led [ int_ ] - 1 ] ) 
clus_current. computeClusterProf ile ( bool_binary=l ) # May end up doing this 
twice for a cluster 
else : 

assert lstClus_persistent 
self.lstClus_ = lstClus_ 

def consolidateArtists { self ): 

# Move all works for a given artist to the cluster with the greatest 

# concentration of works for that artist. 

# This may not be necessary in implementations where can do all clustering at 
artist level. 

dictStrDictClusInt_artistClusterCount = {} 

dict_ = dictStrDictClusInt_artistClusterCount # short handle 

for clus_ in self . lstClus_: 
lstWork_ = clus_.getMembers ( ) 
for work_ in lstWork_: 



str_artist = work_. getArtist ( ) 
if dict_.has_key ( str_artist ) : 

if dict_[ str_artist ] .has_key( clus_ ): 

dict_[str_artist ] [ clus_ ] = dict_[ str_artist ] [ clus_ ] + 1 

else: 

dict_[ str_artist ] [ clus_ ] = 1 
else: 

dict_[ str_artist ] = { clus_ : 1 } 



tests . 
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dictStrClus_artistBestCluster = {} 

for str_artist in dict_ . keys ( ) : 
clus_bestCluster = None 
int_bestCount = 0 
for tupClusInt_ in dict_[ str_artist ]. items (): 
if tupClusInt_[ 1 ] > int_bestCount : 
int_bestCount = tupClusInt_[ 1 ] 
clus_bestCluster = tupClusInt_[ 0 ] 
dictStrClus_artistBestCluster [ str_artist ] = clus_bestCluster 

for clus_ in self . lstClus_: 
clus_ . makeEmpty ( ) 

dictStrLstWork_artistWork = {} 
for work_ in Data. singleton. getWorks () : 
str_artist = work_. getArtist ( ) 

if dictStrLstWork_artistWork.has_key { str_artist ): 

dictStrLstWork_artistWork[ str_artist ] .append ( work_ ) 
else : 

dictStrLstWork_artistWork[ str_artist ] = [ work_ ] 

for tupStrClus_ in dictStrClus_artistBestCluster . items () : 
str_artist = tupStrClus_[ 0 ] 
clus_ = tupStrClus_[ 1 ] 

for work_ in dictStrLstWork_artistWork [ str_artist ] : 
clus_.addToCluster ( work_ ) 

for clus_ in self . lstClus_: 
clus_. computeClusterProf ile( ) 

def getAverageSquaredUniqueArtist ( self ): 
int_sum = 0 

for clus_ in self . lstClus_: 

int_count = clus_. getCountUniqueArtist ( ) 
int_sum = int_sum + int_count**2 . 0 

return float ( int_sum ) / len( self . lstclus_ ) 

def getAverageCountUniqueArtist ( self ): 
int_sum = 0 

for clus_ in self . lstClus_: 

int_count = clus_. getCountUniqueArtist ( ) 
int_sum = int_sum + int_count 

return float ( int_sum ) / len( self . lstClus_ ) 
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def getMaxCountUniqueArtist { self ): 
int_max = 0 

for clus_ in self . lstClus_: 

int_count = clus_. getCountUniqueArtist ( ) 

if int_count > int_max: 
int_max = int_count 
return int_max 

def getMinCountUniqueArtist ( self ): 

int_min = len( Data . singleton . getWorks () ) 
for clus_ in self . lstClus_: 

int_count = clus_. getCountUniqueArtist { ) 
if int_count < int_min: 
int_min = int_count 
return int_min 

def getMaxClusterSize ( self ): 
int_maxSize = 0 

for clus_ in self . lstClus_: 

int_size = len( clus_. getMembers ( ) ) 
if int_size > int_maxSize : 
int_maxSize = int_size 

return int_inaxSize 

def getSignature ( self ): 

# Returns a dictionary which is a signature of the cluster 

# Convenient since diets can be tested for equality, don't need identity 
dictStrDictStrNone_longnameLongname = {} 

for clus_ in self . lstClus_ : 

str_clusterNaine = clus_. getName ( ) 

dictStrDictStrNone_longnameLongnaine[ str_clusterName ] = {} 
for str_associatedLongname in clus_. getAssociatedLongnames ( ) : 

dictStrDictStrNone_longnameLongname[ str_clusterName ][ str_associatedLongname ] 

= None 

return dictStrDictStrNone_longnameLongname 

def performClustering{ self, lstRel_item, bool_recluster=0 , bool_binary=l ): 

# bool_recluster means recluster items that are already clustered. 

# For defensive programming, we copy the list object (nothing in the list is 
copied) 

# so that, when we add to the list below, it doesn't have side effects 

# for calling methods which expect the list to be unmodified 
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lstRel_itemToCluster = copy. copy { lstRel_item ) 

if bool_recluster : 

for clus_ in self . lstClus_: 

for rel_ in clus_. getMembers ( ) : 

lstRel„itemToCluster. append ( rel_ ) 

for clus_ in self . lstClus_: 

clus_.makeEmpty ( ) # Leaves profile intact 

for rel_ in lstRel_itemToCluster : 

f loat_bestRelatedness =0.0 # default to no correlation 

clus_best = None 

for clus_ in self . lstClus._ : 

f loat_currentRelatedness = clus_.getOverallRelatedness ( rel_ ) 
if f loat_currentRelatedness > f loat_bestRelatedness : 
f loat_bestRelatedness = f loat_currentRelatedness 
clus_best = clus_ 
if f loat_bestRelatedness: # IF 0 DOES NOT GO INTO A CLUSTER!! 

clus_best .addToCluster ( rel_ ) 

clus_.computeClusterProfile( bool_binary ) # Prepare the cluster 

center for use in further correlation 



def convergeClusters ( self, f loat_latestTiine, int_miniraumIterations , bool_binary=l ): 
# f loat_latestTiine is latest time to start an iteration 

f loat_currentTime = time . time ( ) 

dict_oldSignature = None 

int_iterations = 0 

bool_done = 0 

while not bool_done: 

if int_iterations < int_minimumIterations or f loat_currentTime <= 
f loat_latestTime : 

print 'iterating:*, int_iterations 

self .performClustering ( [] , bool_recluster=l , bool_binary=bool_binary ) 
dict_newSignature = self . getSignature {) 
if dict_newSignature == dict_oldSignature : 

print 'finishing convergence due to unchanged signatures' 

bool_done = 1 
else : 

dict_oldSignature = dict_newSignature 
f loat_currentTime = time . time ( ) 
int_iterations = int_iterations + 1 
else: 

print ' finishing due to timeout ' 
bool_done = 1 



58 



Express^^ EL 453 889 575 US 
m&vmy Docket R49-009 



def merge ( self ) : 

for clus_ in self . lstClus_: 
clus_. merge ( ) 

def getClusters { self ) : 
return self . lstClus_ 

def addToClusterSet ( self, clus_ ): 
self .lstClus_. append ( clus_ ) 

def readUserPlayStats { self, str_f ileName ): 

# We do not put members into the clusters, we only populate the profiles. 

self .lstClus_ = [] 

fil_ = open (str_f ileName, 'r') 

str_ = fil_.read() 

fil_. close ( ) 

class ClusterParser ( xmllib . XML Parser ): # Embedded class, only used here! 

def init ( self, clst_ ) : 

self.clst_ = clst_ 

self . clus_current = None 

xmllib.XMLParser . init ( self ) 

def start_member ( self, dict_ ) : 

str_artist = intern( dict_ [' artist ' ] ) 
str_title = intern ( dict_[ 'work' ] ) 
tupStrStr_artistTitle = ( str_artist, str_title ) 
str_longname = intern( ' %s - %s ' % tupStrStr_artistTitle ) 
self .clus_current .addToProf ile ( str_longname ) 

def start_cluster ( self, dict_ ): 

self .clus_current = Cluster( dict_[ 'name' ] ) 
clst_. Is tClus_. append ( clus_current ) 

parser_ = ClusterParser ( self ) 
parser_. feed( str_ ) 
parser_, close ( ) 

def writeToDisk(self, str_fileName):
    """Write every cluster and its members to str_fileName as XML.

    The file holds one <cluster name="..."> element per cluster in
    self.lstClus_.  Inside each, one <member> element per work returned by
    the cluster's getMembers(), ordered from highest to lowest overall
    relatedness.  Artist and work strings are passed through toXML() first.

    str_fileName -- path of the file to create or overwrite.
    """
    fil_ = open(str_fileName, 'w')
    try:
        fil_.write('<?xml version="1.0" encoding="ISO-8859-1"?>\n')
        fil_.write('<ClusterContainer'
                   ' xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"\n'
                   " xsi:noNamespaceSchemaLocation='ViewListContainer.xsd'>\n")
        fil_.write('<clusters medium="music">\n')

        for clus_ in self.lstClus_:
            # NOTE(review): the cluster name is written without toXML();
            # confirm names can never contain XML special characters.
            fil_.write('<cluster name="%s">\n' % clus_.getName())

            lstTup_ = []
            for work_ in clus_.getMembers():
                float_relatedness = clus_.getOverallRelatedness(work_)
                # Relatedness goes first so the tuples sort on it.
                tup_ = (float_relatedness,
                        toXML(work_.str_artist),
                        toXML(work_.str_work))
                lstTup_.append(tup_)

            # Ascending sort followed by reverse gives most-related first.
            lstTup_.sort()
            lstTup_.reverse()

            for tup_ in lstTup_:
                fil_.write('<member artist="%s" work="%s" relatedness="%s"/>\n'
                           % (tup_[1], tup_[2], tup_[0]))

            fil_.write('</cluster>\n')

        fil_.write('</clusters>\n')
        fil_.write('</ClusterContainer>\n')
    finally:
        # Close the file even when a write or a cluster accessor raises.
        fil_.close()



###################################################################################### 
######## 

# SCRIPT LOGIC 

# Top-level driver.  Depending on the configuration it either
#   * builds cluster sets from scratch over and over until the configured
#     time budget is used up, keeping the one with the lowest
#     "fabulousness" score, or
#   * loads an existing cluster file and clusters/converges all works
#     against it,
# and then writes the resulting cluster set to Config.str_outClusterFile.
try:

    Config()  # Get configuration data
    Data()    # Create data singleton

    # Stays None until one of the branches below produces a result.
    clst_best = None

    if Config.int_createClusterCount:

        # See http://www.math.tau.ac.il/~nin/learn98/idomil/
        int_numberOfClusters = int(Config.int_createClusterCount
                                   * math.log(Config.int_createClusterCount))

        float_maxTime = time.time() + Config.float_maxTime

        # Starting score to beat: the number of works, squared.
        int_workCount = len(Data.singleton.getWorks())
        float_mostFabulous = float(int_workCount * int_workCount)

        while time.time() < float_maxTime:

            # Deadlines for the convergence passes of this round: half of
            # the remaining time, and all of the remaining time.  (An
            # earlier .33 / .66 split was immediately overwritten by these
            # values and has been dropped.)
            float_maxTime1 = (float_maxTime - time.time()) * .50 + time.time()
            float_maxTime2 = (float_maxTime - time.time()) * 1.0 + time.time()

            print('In outer loop ######')
            print('about to make cluster set')
            clst_1 = ClusterSet(int_randomClusterCount=int_numberOfClusters)

            print('about to perform first clustering')
            clst_1.performClustering([], 1)

            print('about to perform first convergence')
            clst_1.convergeClusters(float_maxTime1,
                                    Config.int_minimumConvergenceIterations)

            lstClus_1 = clst_1.getClusters()

            # A set of clusters of clusters
            clst_2 = ClusterSet(int_randomClusterCount=Config.int_createClusterCount)

            print('about to perform second clustering')
            clst_2.performClustering(lstClus_1, 0)  # Make clusters of clusters

            print('about to merge')
            clst_2.merge()  # Change from clusters of clusters to clusters of works

            print('about to perform second convergence')
            clst_2.convergeClusters(float_maxTime2,
                                    Config.int_minimumConvergenceIterations)

            clst_2.performClustering([], 1, bool_binary=0)

            print('about to perform third convergence')
            # NOTE(review): this pass is given float_maxTime1, the earlier of
            # the two deadlines -- confirm that is intended.
            clst_2.convergeClusters(float_maxTime1,
                                    Config.int_minimumConvergenceIterations,
                                    bool_binary=0)

            float_fabulousness = clst_2.getAverageSquaredUniqueArtist()

            print('max unique: %s min unique: %s'
                  % (clst_2.getMaxCountUniqueArtist(),
                     clst_2.getMinCountUniqueArtist()))
            print('avg unique: %s fabulousness: %s'
                  % (clst_2.getAverageCountUniqueArtist(), float_fabulousness))

            # Lower is better.
            if float_fabulousness < float_mostFabulous:

                fil_ = open('tuninginfo.txt', 'w')
                try:
                    fil_.write('float_tuningRankBottom: '
                               + str(Config.float_tuningRankBottom) + '\n')
                    fil_.write('float_tuningZeroWeight: '
                               + str(Config.float_tuningZeroWeight) + '\n')
                    fil_.write('float_fabulousness: '
                               + str(float_fabulousness) + '\n')
                    fil_.write('clst_2.getMaxCountUniqueArtist(): '
                               + str(clst_2.getMaxCountUniqueArtist()) + '\n')
                    fil_.write('clst_2.getMinCountUniqueArtist(): '
                               + str(clst_2.getMinCountUniqueArtist()) + '\n')
                    fil_.write('clst_2.getAverageCountUniqueArtist(): '
                               + str(clst_2.getAverageCountUniqueArtist()) + '\n')
                finally:
                    fil_.close()

                print('####FOUND NEW BEST###')
                print('writing intermediate')

                float_mostFabulous = float_fabulousness
                clst_2.writeToDisk('intermediate.xml')
                clst_best = clst_2

    elif Config.str_clusterFile:

        clst_cluster = ClusterSet(str_fileName=Config.str_clusterFile)
        clst_cluster.performClustering(Data.singleton.getWorks(), 0)
        # Converge the set that was just clustered (the original text named
        # an undefined variable here).
        clst_cluster.convergeClusters(Config.float_maxTime + time.time(),
                                      Config.int_minimumConvergenceIterations)
        clst_best = clst_cluster

    else:
        # Raised explicitly (rather than "assert 0") so the check is not
        # stripped when Python runs with -O.
        raise ValueError('Invalid config file option')

    if clst_best is None:
        # The time budget ran out before any candidate was accepted.
        raise RuntimeError('no cluster set was produced')

    clst_best.writeToDisk(Config.str_outClusterFile)
    print('done!')

except Exception:
    # sys.exc_info() is used instead of binding the exception in the except
    # clause so this handler parses on every Python version.
    print('ERROR')
    print(sys.exc_info()[1])

# NOTE(review): indentation was lost in the source text; this prompt is
# assumed to run after both success and failure -- confirm.
print('\n\nPress any key to abort:')
sys.stdin.read(1)
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